// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_REGEXP_REGEXP_AST_H_
#define V8_REGEXP_REGEXP_AST_H_

#include "src/objects.h"
#include "src/objects/js-regexp.h"
#include "src/objects/string.h"
#include "src/utils.h"
#include "src/zone/zone-containers.h"
#include "src/zone/zone.h"

namespace v8 {
namespace internal {

#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
    VISIT(Disjunction)                    \
    VISIT(Alternative)                    \
    VISIT(Assertion)                      \
    VISIT(CharacterClass)                 \
    VISIT(Atom)                           \
    VISIT(Quantifier)                     \
    VISIT(Capture)                        \
    VISIT(Group)                          \
    VISIT(Lookaround)                     \
    VISIT(BackReference)                  \
    VISIT(Empty)                          \
    VISIT(Text)

#define FORWARD_DECLARE(Name) class RegExp##Name;
    FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
#undef FORWARD_DECLARE

    class RegExpCompiler;
    class RegExpNode;
    class RegExpTree;

    class RegExpVisitor {
    public:
        virtual ~RegExpVisitor() = default;
#define MAKE_CASE(Name) \
    virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
        FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
#undef MAKE_CASE
    };

    // A simple closed interval.
    class Interval {
    public:
        Interval()
            : from_(kNone)
            , to_(kNone)
        {
        }
        Interval(int from, int to)
            : from_(from)
            , to_(to)
        {
        }
        Interval Union(Interval that)
        {
            if (that.from_ == kNone)
                return *this;
            else if (from_ == kNone)
                return that;
            else
                return Interval(Min(from_, that.from_), Max(to_, that.to_));
        }
        bool Contains(int value) { return (from_ <= value) && (value <= to_); }
        bool is_empty() { return from_ == kNone; }
        int from() const { return from_; }
        int to() const { return to_; }
        static Interval Empty() { return Interval(); }
        static const int kNone = -1;

    private:
        int from_;
        int to_;
    };

    // Represents code units in the range from from_ to to_, both ends are
    // inclusive.
    class CharacterRange {
    public:
        CharacterRange()
            : from_(0)
            , to_(0)
        {
        }
        // For compatibility with the CHECK_OK macro
        CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
        V8_EXPORT_PRIVATE static void AddClassEscape(char type,
            ZoneList<CharacterRange>* ranges,
            Zone* zone);
        // Add class escapes. Add case equivalent closure for \w and \W if necessary.
        V8_EXPORT_PRIVATE static void AddClassEscape(
            char type, ZoneList<CharacterRange>* ranges,
            bool add_unicode_case_equivalents, Zone* zone);
        static Vector<const int> GetWordBounds();
        static inline CharacterRange Singleton(uc32 value)
        {
            return CharacterRange(value, value);
        }
        static inline CharacterRange Range(uc32 from, uc32 to)
        {
            DCHECK(0 <= from && to <= String::kMaxCodePoint);
            DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
            return CharacterRange(from, to);
        }
        static inline CharacterRange Everything()
        {
            return CharacterRange(0, String::kMaxCodePoint);
        }
        static inline ZoneList<CharacterRange>* List(Zone* zone,
            CharacterRange range)
        {
            ZoneList<CharacterRange>* list = new (zone) ZoneList<CharacterRange>(1, zone);
            list->Add(range, zone);
            return list;
        }
        bool Contains(uc32 i) { return from_ <= i && i <= to_; }
        uc32 from() const { return from_; }
        void set_from(uc32 value) { from_ = value; }
        uc32 to() const { return to_; }
        void set_to(uc32 value) { to_ = value; }
        bool is_valid() { return from_ <= to_; }
        bool IsEverything(uc32 max) { return from_ == 0 && to_ >= max; }
        bool IsSingleton() { return (from_ == to_); }
        V8_EXPORT_PRIVATE static void AddCaseEquivalents(
            Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges,
            bool is_one_byte);
        // Whether a range list is in canonical form: Ranges ordered by from value,
        // and ranges non-overlapping and non-adjacent.
        V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList<CharacterRange>* ranges);
        // Convert range list to canonical form. The characters covered by the ranges
        // will still be the same, but no character is in more than one range, and
        // adjacent ranges are merged. The resulting list may be shorter than the
        // original, but cannot be longer.
        static void Canonicalize(ZoneList<CharacterRange>* ranges);
        // Negate the contents of a character range in canonical form.
        static void Negate(ZoneList<CharacterRange>* src,
            ZoneList<CharacterRange>* dst, Zone* zone);
        static const int kStartMarker = (1 << 24);
        static const int kPayloadMask = (1 << 24) - 1;

    private:
        CharacterRange(uc32 from, uc32 to)
            : from_(from)
            , to_(to)
        {
        }

        uc32 from_;
        uc32 to_;
    };

    class CharacterSet final {
    public:
        explicit CharacterSet(uc16 standard_set_type)
            : ranges_(nullptr)
            , standard_set_type_(standard_set_type)
        {
        }
        explicit CharacterSet(ZoneList<CharacterRange>* ranges)
            : ranges_(ranges)
            , standard_set_type_(0)
        {
        }
        ZoneList<CharacterRange>* ranges(Zone* zone);
        uc16 standard_set_type() const { return standard_set_type_; }
        void set_standard_set_type(uc16 special_set_type)
        {
            standard_set_type_ = special_set_type;
        }
        bool is_standard() { return standard_set_type_ != 0; }
        V8_EXPORT_PRIVATE void Canonicalize();

    private:
        ZoneList<CharacterRange>* ranges_;
        // If non-zero, the value represents a standard set (e.g., all whitespace
        // characters) without having to expand the ranges.
        uc16 standard_set_type_;
    };

    class TextElement final {
    public:
        enum TextType { ATOM,
            CHAR_CLASS };

        static TextElement Atom(RegExpAtom* atom);
        static TextElement CharClass(RegExpCharacterClass* char_class);

        int cp_offset() const { return cp_offset_; }
        void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
        int length() const;

        TextType text_type() const { return text_type_; }

        RegExpTree* tree() const { return tree_; }

        RegExpAtom* atom() const
        {
            DCHECK(text_type() == ATOM);
            return reinterpret_cast<RegExpAtom*>(tree());
        }

        RegExpCharacterClass* char_class() const
        {
            DCHECK(text_type() == CHAR_CLASS);
            return reinterpret_cast<RegExpCharacterClass*>(tree());
        }

    private:
        TextElement(TextType text_type, RegExpTree* tree)
            : cp_offset_(-1)
            , text_type_(text_type)
            , tree_(tree)
        {
        }

        int cp_offset_;
        TextType text_type_;
        RegExpTree* tree_;
    };

    class RegExpTree : public ZoneObject {
    public:
        static const int kInfinity = kMaxInt;
        virtual ~RegExpTree() = default;
        virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
        virtual RegExpNode* ToNode(RegExpCompiler* compiler,
            RegExpNode* on_success)
            = 0;
        virtual bool IsTextElement() { return false; }
        virtual bool IsAnchoredAtStart() { return false; }
        virtual bool IsAnchoredAtEnd() { return false; }
        virtual int min_match() = 0;
        virtual int max_match() = 0;
        // Returns the interval of registers used for captures within this
        // expression.
        virtual Interval CaptureRegisters() { return Interval::Empty(); }
        virtual void AppendToText(RegExpText* text, Zone* zone);
        V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os,
            Zone* zone); // NOLINT
#define MAKE_ASTYPE(Name)             \
    virtual RegExp##Name* As##Name(); \
    virtual bool Is##Name();
        FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
#undef MAKE_ASTYPE
    };

    class RegExpDisjunction final : public RegExpTree {
    public:
        explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
        void* Accept(RegExpVisitor* visitor, void* data) override;
        RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
        RegExpDisjunction* AsDisjunction() override;
        Interval CaptureRegisters() override;
        bool IsDisjunction() override;
        bool IsAnchoredAtStart() override;
        bool IsAnchoredAtEnd() override;
        int min_match() override { return min_match_; }
        int max_match() override { return max_match_; }
        ZoneList<RegExpTree*>* alternatives() { return alternatives_; }

    private:
        bool SortConsecutiveAtoms(RegExpCompiler* compiler);
        void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
        void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
        ZoneList<RegExpTree*>* alternatives_;
        int min_match_;
        int max_match_;
    };

    class RegExpAlternative final : public RegExpTree {
    public:
        explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
        void* Accept(RegExpVisitor* visitor, void* data) override;
        RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
        RegExpAlternative* AsAlternative() override;
        Interval CaptureRegisters() override;
        bool IsAlternative() override;
        bool IsAnchoredAtStart() override;
        bool IsAnchoredAtEnd() override;
        int min_match() override { return min_match_; }
        int max_match() override { return max_match_; }
        ZoneList<RegExpTree*>* nodes() { return nodes_; }

    private:
        ZoneList<RegExpTree*>* nodes_;
        int min_match_;
        int max_match_;
    };

    class RegExpAssertion final : public RegExpTree {
    public:
        enum AssertionType {
            START_OF_LINE,
            START_OF_INPUT,
            END_OF_LINE,
            END_OF_INPUT,
            BOUNDARY,
            NON_BOUNDARY
        };
        RegExpAssertion(AssertionType type, JSRegExp::Flags flags)
            : assertion_type_(type)
            , flags_(flags)
        {
        }
        void* Accept(RegExpVisitor* visitor, void* data) override;
        RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
        RegExpAssertion* AsAssertion() override;
        bool IsAssertion() override;
        bool IsAnchoredAtStart() override;
        bool IsAnchoredAtEnd() override;
        int min_match() override { return 0; }
        int max_match() override { return 0; }
        AssertionType assertion_type() { return assertion_type_; }

    private:
        const AssertionType assertion_type_;
        const JSRegExp::Flags flags_;
    };

    class RegExpCharacterClass final : public RegExpTree {
    public:
        // NEGATED: The character class is negated and should match everything but
        //     the specified ranges.
        // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
        //     surrogate and should not be unicode-desugared (crbug.com/641091).
        enum Flag {
            NEGATED = 1 << 0,
            CONTAINS_SPLIT_SURROGATE = 1 << 1,
        };
        using CharacterClassFlags = base::Flags<Flag>;

        RegExpCharacterClass(
            Zone* zone, ZoneList<CharacterRange>* ranges, JSRegExp::Flags flags,
            CharacterClassFlags character_class_flags = CharacterClassFlags())
            : set_(ranges)
            , flags_(flags)
            , character_class_flags_(character_class_flags)
        {
            // Convert the empty set of ranges to the negated Everything() range.
            if (ranges->is_empty()) {
                ranges->Add(CharacterRange::Everything(), zone);
                character_class_flags_ ^= NEGATED;
            }
        }
        RegExpCharacterClass(uc16 type, JSRegExp::Flags flags)
            : set_(type)
            , flags_(flags)
            , character_class_flags_(CharacterClassFlags())
        {
        }
        void* Accept(RegExpVisitor* visitor, void* data) override;
        RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
        RegExpCharacterClass* AsCharacterClass() override;
        bool IsCharacterClass() override;
        bool IsTextElement() override { return true; }
        int min_match() override { return 1; }
        // The character class may match two code units for unicode regexps.
        // TODO(yangguo): we should split this class for usage in TextElement, and
        //                make max_match() dependent on the character class content.
        int max_match() override { return 2; }
        void AppendToText(RegExpText* text, Zone* zone) override;
        CharacterSet character_set() { return set_; }
        // TODO(lrn): Remove need for complex version if is_standard that
        // recognizes a mangled standard set and just do { return set_.is_special(); }
        bool is_standard(Zone* zone);
        // Returns a value representing the standard character set if is_standard()
        // returns true.
        // Currently used values are:
        // s : unicode whitespace
        // S : unicode non-whitespace
        // w : ASCII word character (digit, letter, underscore)
        // W : non-ASCII word character
        // d : ASCII digit
        // D : non-ASCII digit
        // . : non-newline
        // * : All characters, for advancing unanchored regexp
        uc16 standard_type() const { return set_.standard_set_type(); }
        ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
        bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; }
        JSRegExp::Flags flags() const { return flags_; }
        bool contains_split_surrogate() const
        {
            return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
        }

    private:
        CharacterSet set_;
        const JSRegExp::Flags flags_;
        CharacterClassFlags character_class_flags_;
    };

    class RegExpAtom final : public RegExpTree {
    public:
        explicit RegExpAtom(Vector<const uc16> data, JSRegExp::Flags flags)
            : data_(data)
            , flags_(flags)
        {
        }
        void* Accept(RegExpVisitor* visitor, void* data) override;
        RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
        RegExpAtom* AsAtom() override;
        bool IsAtom() override;
        bool IsTextElement() override { return true; }
        int min_match() override { return data_.length(); }
        int max_match() override { return data_.length(); }
        void AppendToText(RegExpText* text, Zone* zone) override;
        Vector<const uc16> data() { return data_; }
        int length() { return data_.length(); }
        JSRegExp::Flags flags() const { return flags_; }
        bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }

    private:
        Vector<const uc16> data_;
        const JSRegExp::Flags flags_;
    };

    class RegExpText final : public RegExpTree {
    public:
        explicit RegExpText(Zone* zone)
            : elements_(2, zone)
            , length_(0)
        {
        }
        void* Accept(RegExpVisitor* visitor, void* data) override;
        RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
        RegExpText* AsText() override;
        bool IsText() override;
        bool IsTextElement() override { return true; }
        int min_match() override { return length_; }
        int max_match() override { return length_; }
        void AppendToText(RegExpText* text, Zone* zone) override;
        void AddElement(TextElement elm, Zone* zone)
        {
            elements_.Add(elm, zone);
            length_ += elm.length();
        }
        ZoneList<TextElement>* elements() { return &elements_; }

    private:
        ZoneList<TextElement> elements_;
        int length_;
    };

    class RegExpQuantifier final : public RegExpTree {
    public:
        enum QuantifierType { GREEDY,
            NON_GREEDY,
            POSSESSIVE };
        RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
            : body_(body)
            , min_(min)
            , max_(max)
            , quantifier_type_(type)
        {
            if (min > 0 && body->min_match() > kInfinity / min) {
                min_match_ = kInfinity;
            } else {
                min_match_ = min * body->min_match();
            }
            if (max > 0 && body->max_match() > kInfinity / max) {
                max_match_ = kInfinity;
            } else {
                max_match_ = max * body->max_match();
            }
        }
        void* Accept(RegExpVisitor* visitor, void* data) override;
        RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
        static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
            RegExpCompiler* compiler, RegExpNode* on_success,
            bool not_at_start = false);
        RegExpQuantifier* AsQuantifier() override;
        Interval CaptureRegisters() override;
        bool IsQuantifier() override;
        int min_match() override { return min_match_; }
        int max_match() override { return max_match_; }
        int min() { return min_; }
        int max() { return max_; }
        bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
        bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
        bool is_greedy() { return quantifier_type_ == GREEDY; }
        RegExpTree* body() { return body_; }

    private:
        RegExpTree* body_;
        int min_;
        int max_;
        int min_match_;
        int max_match_;
        QuantifierType quantifier_type_;
    };

    class RegExpCapture final : public RegExpTree {
    public:
        explicit RegExpCapture(int index)
            : body_(nullptr)
            , index_(index)
            , name_(nullptr)
        {
        }
        void* Accept(RegExpVisitor* visitor, void* data) override;
        RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
        static RegExpNode* ToNode(RegExpTree* body, int index,
            RegExpCompiler* compiler, RegExpNode* on_success);
        RegExpCapture* AsCapture() override;
        bool IsAnchoredAtStart() override;
        bool IsAnchoredAtEnd() override;
        Interval CaptureRegisters() override;
        bool IsCapture() override;
        int min_match() override { return body_->min_match(); }
        int max_match() override { return body_->max_match(); }
        RegExpTree* body() { return body_; }
        void set_body(RegExpTree* body) { body_ = body; }
        int index() { return index_; }
        const ZoneVector<uc16>* name() const { return name_; }
        void set_name(const ZoneVector<uc16>* name) { name_ = name; }
        static int StartRegister(int index) { return index * 2; }
        static int EndRegister(int index) { return index * 2 + 1; }

    private:
        RegExpTree* body_;
        int index_;
        const ZoneVector<uc16>* name_;
    };

    class RegExpGroup final : public RegExpTree {
    public:
        explicit RegExpGroup(RegExpTree* body)
            : body_(body)
        {
        }
        void* Accept(RegExpVisitor* visitor, void* data) override;
        RegExpNode* ToNode(RegExpCompiler* compiler,
            RegExpNode* on_success) override
        {
            return body_->ToNode(compiler, on_success);
        }
        RegExpGroup* AsGroup() override;
        bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
        bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
        bool IsGroup() override;
        int min_match() override { return body_->min_match(); }
        int max_match() override { return body_->max_match(); }
        Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
        RegExpTree* body() { return body_; }

    private:
        RegExpTree* body_;
    };

    class RegExpLookaround final : public RegExpTree {
    public:
        enum Type { LOOKAHEAD,
            LOOKBEHIND };

        RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
            int capture_from, Type type)
            : body_(body)
            , is_positive_(is_positive)
            , capture_count_(capture_count)
            , capture_from_(capture_from)
            , type_(type)
        {
        }

        void* Accept(RegExpVisitor* visitor, void* data) override;
        RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
        RegExpLookaround* AsLookaround() override;
        Interval CaptureRegisters() override;
        bool IsLookaround() override;
        bool IsAnchoredAtStart() override;
        int min_match() override { return 0; }
        int max_match() override { return 0; }
        RegExpTree* body() { return body_; }
        bool is_positive() { return is_positive_; }
        int capture_count() { return capture_count_; }
        int capture_from() { return capture_from_; }
        Type type() { return type_; }

        class Builder {
        public:
            Builder(bool is_positive, RegExpNode* on_success,
                int stack_pointer_register, int position_register,
                int capture_register_count = 0, int capture_register_start = 0);
            RegExpNode* on_match_success() { return on_match_success_; }
            RegExpNode* ForMatch(RegExpNode* match);

        private:
            bool is_positive_;
            RegExpNode* on_match_success_;
            RegExpNode* on_success_;
            int stack_pointer_register_;
            int position_register_;
        };

    private:
        RegExpTree* body_;
        bool is_positive_;
        int capture_count_;
        int capture_from_;
        Type type_;
    };

    class RegExpBackReference final : public RegExpTree {
    public:
        explicit RegExpBackReference(JSRegExp::Flags flags)
            : capture_(nullptr)
            , name_(nullptr)
            , flags_(flags)
        {
        }
        RegExpBackReference(RegExpCapture* capture, JSRegExp::Flags flags)
            : capture_(capture)
            , name_(nullptr)
            , flags_(flags)
        {
        }
        void* Accept(RegExpVisitor* visitor, void* data) override;
        RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
        RegExpBackReference* AsBackReference() override;
        bool IsBackReference() override;
        int min_match() override { return 0; }
        // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
        // recursion, we give up. Ignorance is bliss.
        int max_match() override { return kInfinity; }
        int index() { return capture_->index(); }
        RegExpCapture* capture() { return capture_; }
        void set_capture(RegExpCapture* capture) { capture_ = capture; }
        const ZoneVector<uc16>* name() const { return name_; }
        void set_name(const ZoneVector<uc16>* name) { name_ = name; }

    private:
        RegExpCapture* capture_;
        const ZoneVector<uc16>* name_;
        const JSRegExp::Flags flags_;
    };

    class RegExpEmpty final : public RegExpTree {
    public:
        RegExpEmpty() = default;
        void* Accept(RegExpVisitor* visitor, void* data) override;
        RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
        RegExpEmpty* AsEmpty() override;
        bool IsEmpty() override;
        int min_match() override { return 0; }
        int max_match() override { return 0; }
    };

} // namespace internal
} // namespace v8

#endif // V8_REGEXP_REGEXP_AST_H_
